################################################################################
#                                                                              #
# TITLE        : BREAKING SILENCE: HOW INTIMATE PARTNER VIOLENCE AND           #         
#                REPORTING SHAPE LATER LIFE OUTCOMES                           #        
# CODE EDITOR  : Harrison Chang                                                #
# LAST MODIFIED: 2025/07/18                                                    #
# PURPOSE      : This file generates summary statistics                        #
#                                                                              #
#                                                                              #
#                                                                              #
#                                                                              #
################################################################################

# Start from a clean session: remove every object listed by ls() in the
# current environment, then trigger a garbage collection.
rm(list=ls());gc()

library(data.table)
library(tidyverse)
library(dplyr)
library(fixest) 
library(ggplot2)
library(zoo)

# Drive letter under which the whole H112057 project tree lives.
Disk                  <- "E"

# Run the shared arrow helper script (presumably what makes open_dataset()
# available below -- TODO confirm).
source(sprintf("%s:/H112057/arrow/arrow_20231023/arrow_helpers.R", Disk))


###################################################
#                       Setting                   #
###################################################

# Relative paths used later in this script are resolved against this folder.
setwd(sprintf("%s:/H112057/H112057_Harrison/DV", Disk))

# Absolute locations of the project's input and output folders.
output.dir            <- sprintf("%s:/H112057/H112057_Harrison/DV/processed_data", Disk)
par.dir               <- sprintf("%s:/H112057/parquet", Disk)
csv.dir               <- sprintf("%s:/H112057/data", Disk)
result.dir            <- sprintf("%s:/H112057/H112057_Harrison/DV/results", Disk)
CleanData.dir         <- sprintf("%s:/H112057/CleanData/processed_data", Disk)
# Parquet file with each person's highest education level.
admin_edu.path        <- file.path(CleanData.dir, "edu/highest_educ.parquet")

gc()



# # # # # # # # # # # # # # # # # # # # # # # # # # # # # 
####        Table 1: General Summary Statistics      ####
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # 


# Sample definition. Recognised values of sample_selection are "IPVf_once",
# "IPVf_repeat" and "IPVm_once"; any other value leaves the sample unfiltered.
sample_selection      <- "IPVf_once"
sample_year           <- 2012L:2018L   # event (report) years kept
data_year             <- 2003L:2018L   # panel years kept

# Load the victim panel into memory as a data.table.
data                  <- as.data.table(
  collect(open_dataset("processed_data/data/IPV_victim_general_panel.parquet"))
)

# Keep only rows flagged with complete_marr_dummy == 1.
data                  <- data[complete_marr_dummy == 1]
# Number of distinct persons by id_sex and repeated status.
data[!duplicated(ID), .(count = .N), by = .(id_sex, repeated)]

# Restrict to the chosen event years and panel years in a single pass.
dt_sample             <- data[event_year %in% sample_year & year %in% data_year]

rm(data)
gc()


# Apply the sex / repeat filter that corresponds to sample_selection.
dt_sample             <- switch(
  sample_selection,
  IPVf_once   = dt_sample[id_sex == 0 & repeated == 0],
  IPVf_repeat = dt_sample[id_sex == 0 & repeated == 1],
  IPVm_once   = dt_sample[id_sex == 1 & repeated == 0],
  dt_sample
)

# Number of distinct persons left in the sample.
uniqueN(dt_sample, by = "ID")

# Collapse to ONE observation per person: the row one year before reporting
# (the original author notes 90604 observations at this step).
dt_sample[, `:=`(
  relative_year = year - event_year,
  wage          = TRUE_AMT / FTWORKING
)]
cross_section         <- dt_sample[relative_year == -1][!duplicated(ID)]

# Violence-type indicators: empty string = missing, code containing "A" =
# physical, code containing "B" = mental.
cross_section[, .N, by = VIOLENCETYPE]
cross_section[, `:=`(
  type_missing  = as.numeric(VIOLENCETYPE == ""),
  type_physical = as.numeric(grepl("A", VIOLENCETYPE)),
  type_mental   = as.numeric(grepl("B", VIOLENCETYPE))
)]



##### Panel A
# Age: distribution (kept in `export`), sample size and mean.
export                <- cross_section[, .N, by = vic_age][order(vic_age)]
nrow(cross_section)
cross_section[, mean(vic_age)]

# Age distribution split by reporting source code (D, C, I).
cross_section[, .N, by = INFOUNIT][order(N)]
export_source_D       <- cross_section[INFOUNIT == "D", .N, by = vic_age][order(vic_age)]
export_source_C       <- cross_section[INFOUNIT == "C", .N, by = vic_age][order(vic_age)]
export_source_I       <- cross_section[INFOUNIT == "I", .N, by = vic_age][order(vic_age)]


# Administrative education: distribution, and mean over rows whose
# highest_edu is not 0.
cross_section[, .N, by = highest_edu][order(highest_edu)]
cross_section[highest_edu != 0, mean(highest_edu)]
# Depression outpatient visit: distribution and share.
cross_section[, .N, by = depression_dummy][order(depression_dummy)]
cross_section[, mean(depression_dummy)]
# Max pre-reporting income (since year -10).
cross_section[, summary(max_pre_income)]



##### Panel B
# Open case or not: isopen is 1 only when ISOPENCASE is non-missing and "Y".
cross_section[, .N, by = ISOPENCASE]
cross_section[, isopen := as.numeric(!is.na(ISOPENCASE) & ISOPENCASE == "Y")]
cross_section[, mean(isopen)]
# Violence type shares among rows whose type is not missing.
cross_section[type_missing == 0, mean(type_physical)]
cross_section[type_missing == 0, mean(type_mental)]

# Severity index.
# Item 01 counts when DAEVALITEM01A is a non-empty string.
cross_section[, daeval01 := as.numeric(DAEVALITEM01A != "")]
# Items 04-15 count when the answer is exactly "Y".
cross_section[,
  (sprintf("daeval%02d", 4:15)) := lapply(.SD, function(item) as.numeric(item == "Y")),
  .SDcols = sprintf("DAEVALITEM%02d", 4:15)
]

# The index is the plain sum of the 13 item indicators (01 and 04-15).
cross_section[,
  severity_index := Reduce(`+`, .SD),
  .SDcols = sprintf("daeval%02d", c(1L, 4:15))
]

cross_section[, .N, by = severity_index][order(severity_index)]

# Mean severity among rows with a positive index.
cross_section[severity_index > 0, mean(severity_index)]



##### Panel D
# How many victims in the sample have kids under 12?
# Age of each of the (up to five) children in the event year.
cross_section[,
  (paste0("child_", 1:5, "_age")) := lapply(.SD, function(birth_year) event_year - birth_year),
  .SDcols = paste0("child_", 1:5, "_birth_year")
]

# Count, per person, the children younger than 12; missing ages count as 0.
cross_section[,
  num_children_under_12 := rowSums(.SD < 12, na.rm = TRUE),
  .SDcols = paste0("child_", 1:5, "_age")
]
cross_section[, .N, by = num_children_under_12][order(num_children_under_12)]

# Share of the sample with at least one kid under 12.
mean(cross_section$num_children_under_12 > 0)
# Among those, the mean number of kids under 12.
mean(cross_section[num_children_under_12 > 0]$num_children_under_12)
# Marital status distribution.
cross_section[, .N, by = MARR][order(MARR)]





##### Panel E
# Wage distribution, then the mean wage over rows with wage above 2000.
# NOTE(review): wage is TRUE_AMT / FTWORKING, so a zero FTWORKING would yield
# Inf and pass the wage > 2000 filter -- confirm FTWORKING is never 0.
cross_section[, .N, .(wage)][order(wage)]
mean(cross_section[wage > 2000]$wage)
# Occupation distribution.
cross_section[, .N, .(occupation)][order(occupation)]



# Mean of the maximum pre-reporting income.
mean(cross_section$max_pre_income)




##### Panel F
# Reporting source (C: Hospital + D: Police + I: 113).
cross_section[, .N, .(INFOUNIT)][order(N)]

# Free the Table 1 cross-section objects. dt_sample is deliberately kept:
# the Table B.5 and Table B.9 sections below still read it, and `data` was
# already removed, so dt_sample cannot be rebuilt later in this script.
rm(cross_section, export); gc()




# # # # # # # # # # # # # # # # # # # # # # # # # # # # # 
#### Table B.5: Summary Statistics By Report Channel ####
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # 

# Numeric reporting-source code: 1 = "D" (Police), 2 = "C" (Hospital),
# 3 = "I" (113), 4 = anything else, including missing.
dt_sample[, .N, by = INFOUNIT][order(N)]
dt_sample[, infounit := as.numeric(match(INFOUNIT, c("D", "C", "I"), nomatch = 4L))]
dt_sample[, .N, by = infounit][order(infounit)]

# Collapse to ONE observation per person: the row one year before reporting
# (the original author notes 80723 observations at this step).
dt_sample[, relative_year := year - event_year]
cross_section       <- dt_sample[relative_year == -1][!duplicated(ID)]

cross_section[, .N, by = infounit][order(infounit)]


# Mean age within each reporting source.
cross_section[infounit == 1, mean(vic_age)]
cross_section[infounit == 2, mean(vic_age)]
cross_section[infounit == 3, mean(vic_age)]
cross_section[infounit == 4, mean(vic_age)]

#For all the panels, one can use the code in Table 1 and with the condition 
#that infounit==1, infounit==2, infounit==3, infounit==4






# # # # # # # # # # # # # # # # # # # # # # # # # # # # # 
####  Table B.9: Summary Statistics By Survey Status ####
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # 


# Load the per-person violence duration data as a data.table.
violence_time         <- as.data.table(
  collect(open_dataset("processed_data/data/2012_2018/IPV_violence_year.parquet"))
)

violence_time[, .N, by = duration_year][order(duration_year)]
# Year the violence started = report year minus reported duration.
violence_time[, violence_year := report_year - duration_year]

# Attach the duration answers to every row of the sample; rows of dt_sample
# without a match keep NA in the violence_time columns.
# Original author's counts: of the 90604 people, 57075 report a duration of
# nine years or less and 9396 report ten or more years.
dt_sample             <- violence_time[dt_sample, on = "ID"]
dt_sample[!duplicated(ID), .N, by = duration_year][order(duration_year)]
uniqueN(dt_sample, by = "ID")
uniqueN(dt_sample[!is.na(duration_year)], by = "ID")


# Collapse to ONE observation per person: the row one year before reporting
# (the original author notes 90604 observations at this step).
dt_sample[, `:=`(
  relative_year = year - event_year,
  wage          = TRUE_AMT / FTWORKING
)]
cross_section         <- dt_sample[relative_year == -1][!duplicated(ID)]

# survey = 1 when a duration answer exists for the person, 0 otherwise.
cross_section[, survey := as.numeric(!is.na(duration_year))]
cross_section[, .N, by = survey][order(survey)]


####### Panel A
# Mean age by survey status.
cross_section[survey == 0, mean(vic_age)]
cross_section[survey == 1, mean(vic_age)]

#For all the panels, one can use the code in Table 1 and with the condition 
#that survey==0 or survey==1





